It turns out that words like the n-word and "TERF" appear very frequently in the Holyoke and Smith confessionals, respectively. We want to dig deeper into how these and other controversial words are being used.
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import functools
from os import path
from scipy.ndimage import imread
from nltk.util import ngrams
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from IPython.display import display
import cufflinks as cf
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
# Configure plotly for offline (in-notebook) rendering.
init_notebook_mode()
# Use ggplot styling for all matplotlib figures in this notebook.
plt.style.use('ggplot')
# Render matplotlib output inline in the notebook.
%matplotlib inline
# Reading in data: cleaned confessional exports for Holyoke and Smith.
# Suffixes: c = comments, r = reports, s = secrets.
holyc_df = pd.read_csv('../tmp/clean/holyokecon_confessional_comments.csv')
holyr_df = pd.read_csv('../tmp/clean/holyokecon_confessional_reports.csv')
holys_df = pd.read_csv('../tmp/clean/holyokecon_confessional_secrets.csv')
smithc_df = pd.read_csv('../tmp/clean/smithcon_confessional_comments.csv')
smithr_df = pd.read_csv('../tmp/clean/smithcon_confessional_reports.csv')
smiths_df = pd.read_csv('../tmp/clean/smithcon_confessional_secrets.csv')
# defining some global variables: the cleaned token columns produced by the merge
SECRET_COL = 'clean_tokens_secret'
REPORT_COL = 'clean_tokens_report'

# Attach each secret's reports (if any). A left join keeps secrets that
# were never reported; their report columns come back as NaN.
holysr_df = holys_df.merge(holyr_df, left_on='id', right_on="secret_id",
                           how='left', suffixes=('_secret', '_report'))

# preprocess: remove rows with null clean_tokens_secret value.
# BUG FIX: the original referenced an undefined name TEXT_COL (NameError);
# the intended column is the secret-token column, SECRET_COL.
holysr_df = holysr_df[holysr_df[SECRET_COL].notnull()]
holysr_df.head()
# detecting secrets containing a specific word
pattern = r'gay|lesbian|trans|bisex' # niggar|nigger|asian|yellow|latino|white|gay|lesbian|trans|bi
# BUG FIX: the original used an undefined name TEXT_COL (NameError);
# the search runs over the secret-token column.
selector = holysr_df[SECRET_COL].str.contains(pattern)
match_df = holysr_df[selector]
# Drop duplicate secrets (a secret with several reports appears once per report)
match_secrets = match_df.drop_duplicates('id_secret')
# Matching secrets that were never reported (no joined report row)
match_not_reported = match_secrets[match_secrets['id_report'].isnull()]
# Matching secrets that were reported at least once
match_reported = match_secrets[match_secrets['id_report'].notnull()]
# Rows that carry report text (one row per report, duplicates kept)
report_text = match_df[match_df[REPORT_COL].notnull()]
# Shared WordCloud settings used for every cloud in this notebook.
# random_state is fixed so cloud layouts are reproducible across runs.
word_cloud_options = dict(
    width=800,
    height=800,
    background_color="white",
    max_words=500,
    stopwords=STOPWORDS,
    random_state=42,
)
def create_word_cloud(text_iterable, image_color_fp=None,
                      title='', **kwargs):
    """Render a word cloud of the joined text, optionally masked/colored by an image.

    Parameters
    ----------
    text_iterable : iterable of str
        Documents to concatenate into one corpus for the cloud.
    image_color_fp : str or None
        Path to an image used both as the cloud mask and its color source.
        BUG FIX: the original crashed on the default None; now a plain
        (unmasked, default-colored) cloud is drawn instead.
    title : str
        Figure title.
    **kwargs
        Forwarded to WordCloud (e.g. the shared word_cloud_options).
    """
    mask_image = None
    if image_color_fp is not None:
        # BUG FIX: scipy.ndimage.imread was deprecated and removed
        # (SciPy >= 1.2); matplotlib's imread replaces it here.
        mask_image = plt.imread(image_color_fp)
        if mask_image.dtype != np.uint8:
            # plt.imread returns floats in [0, 1] for PNGs; WordCloud masks
            # expect uint8 0-255, as the removed scipy imread produced.
            mask_image = (mask_image * 255).astype(np.uint8)
        # Copy rather than mutate the caller's kwargs dict.
        kwargs = dict(kwargs, mask=mask_image)
    wc = WordCloud(**kwargs)
    wc.generate(" ".join(text_iterable))
    if mask_image is not None:
        wc.recolor(color_func=ImageColorGenerator(mask_image))
    plt.figure(figsize=(8, 8))
    plt.title(title)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
# Image used as both the cloud mask and its color palette.
logo_fp = '../assets/logo2.png'

# Word cloud of every Holyoke secret that matched the search pattern.
create_word_cloud(
    match_secrets[SECRET_COL].astype(str),
    logo_fp,
    title="Holyoke Secrets Containing the word %s" % pattern,
    **word_cloud_options
)
# Defining functions to compute word frequency
def word_counter(text, n=1, length_thres=50):
    """Count the 1- through n-grams of the whitespace tokens in *text*.

    Tokens of length >= length_thres are discarded (guards against
    pathological unbroken strings). Returns a Counter keyed by
    space-joined n-grams.
    """
    tokens = [tk for tk in text.split() if len(tk) < length_thres]
    counts = Counter()
    # BUG FIX: the original extended the token list in place inside the
    # loop and re-ran ngrams() over the growing list, which double-counted
    # every unigram and fabricated bigrams spanning the appended copies.
    # Each n-gram order is now taken over the original token sequence.
    for order in range(1, n + 1):
        counts.update(" ".join(gram) for gram in ngrams(tokens, order))
    return counts
def word_aggregater(corpus_list, n=1):
    """Sum per-document n-gram counts (via word_counter) over a corpus."""
    total = Counter()
    for document in corpus_list:
        total.update(word_counter(document, n=n))
    return total
def count_token_frequency(token_series, filter_thres, **kwargs):
    """Build a sorted n-gram frequency table for a series of token strings.

    Parameters
    ----------
    token_series : iterable of str
        Documents whose n-grams are counted (forwarded to word_aggregater).
    filter_thres : int
        Keep only n-grams whose frequency is strictly greater than this.
    **kwargs
        Forwarded to word_aggregater (e.g. n=2 for up to bigrams).

    Returns
    -------
    DataFrame with columns 'word', 'frequency', 'ngrams', sorted by
    frequency descending with a fresh index.
    """
    counts = word_aggregater(token_series, **kwargs)
    # list(...) is needed on Python 3, where .items() is a lazy view;
    # naming the columns up front replaces the post-hoc inplace rename.
    freq_df = pd.DataFrame(list(counts.items()),
                           columns=['word', 'frequency'])
    freq_df = (freq_df[freq_df['frequency'] > filter_thres]
               .sort_values('frequency', ascending=False))
    # Number of tokens in the n-gram (1 = unigram, 2 = bigram, ...).
    freq_df['ngrams'] = freq_df['word'].str.split().str.len()
    return freq_df.reset_index(drop=True)
# create frequency count dataframes: unigrams + bigrams, frequency > 5
FREQ_FLOOR = 5
secrets_corpus = count_token_frequency(
    match_secrets['clean_tokens_secret'], FREQ_FLOOR, n=2)
secrets_not_reported_corpus = count_token_frequency(
    match_not_reported['clean_tokens_secret'], FREQ_FLOOR, n=2)
secrets_reported_corpus = count_token_frequency(
    match_reported['clean_tokens_secret'], FREQ_FLOOR, n=2)
# NOTE(review): this counts the *secret* tokens of rows that have report
# text; if the intent is the report text itself, the column should be
# 'clean_tokens_report' — confirm with the original author.
report_text_corpus = count_token_frequency(
    report_text['clean_tokens_secret'], FREQ_FLOOR, n=2)
# Filtering secrets by ngrams: keep only the bigram rows in each table.
def _bigrams_only(freq_df):
    """Return only the rows whose n-gram order is 2."""
    return freq_df[freq_df['ngrams'] == 2]

secrets_corpus = _bigrams_only(secrets_corpus)
secrets_not_reported_corpus = _bigrams_only(secrets_not_reported_corpus)
secrets_reported_corpus = _bigrams_only(secrets_reported_corpus)
# Filtering reported and not reported vocabulary.
# Vocabulary is the top 30 bigrams in secrets_corpus.
# (The original comment said "first 100 words", but the code takes 30.)
vocabulary = secrets_corpus['word'][:30].tolist()

def vocab_filter(word):
    """True when *word* belongs to the top-bigram vocabulary."""
    return word in vocabulary

# Series.isin replaces the redundant `True if x in ... else False` lambda
# and runs vectorized instead of per-element apply.
secrets_nr_filtered = secrets_not_reported_corpus[
    secrets_not_reported_corpus['word'].isin(vocabulary)
]
secrets_r_filtered = secrets_reported_corpus[
    secrets_reported_corpus['word'].isin(vocabulary)
]
def create_bar_trace(dataframe, graph_obj, **go_kwargs):
    """Build a plotly trace from a frequency dataframe.

    `graph_obj` is the trace constructor (e.g. go.Bar); frequency goes on
    the x axis, the word on the y axis, and any extra keyword arguments
    are forwarded to the constructor unchanged.
    """
    frequencies = dataframe['frequency']
    words = dataframe['word']
    return graph_obj(x=frequencies, y=words, **go_kwargs)
# One trace per subset: lavender for unreported secrets, purple for reported.
trace1 = create_bar_trace(
    secrets_nr_filtered, go.Bar,
    name='Not Reported', orientation='h', marker=dict(color='#bc94d3'))
trace2 = create_bar_trace(
    secrets_r_filtered, go.Bar,
    name='Reported', orientation='h', marker=dict(color='#8551a3'))
# Stacked horizontal bar chart comparing reported vs. not-reported
# bigram frequencies over the shared vocabulary.
data = [trace1, trace2]
layout = go.Layout(
    # Wide left margin leaves room for the bigram labels.
    margin=dict(l=125),
    barmode='stack',
    width=700,
    height=700,
    legend=dict(
        yanchor='top',
        traceorder="normal",
        xanchor="left",
        borderwidth=0,
        y=1.20,
        x=0,
        font=dict(color="", family="", size=18),
    ),
    # Reversed y axis puts the most frequent bigram at the top.
    yaxis=dict(autorange='reversed'),
    xaxis=dict(mirror=True, side="top"),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Prompt for the next search term.
# BUG FIX: raw_input() and the statement form of print are Python 2 only;
# under Python 3 these lines raise NameError / SyntaxError respectively.
search = input("What do you want to search for?")
print(search)